# -------------------------------------------------------------------
# Purpose: Creates Table B51
# Author:  Max Posch, 25/07/2025
# Usage:   Source this script to generate the table.
# -------------------------------------------------------------------
# Check that required paths exist
# (both path variables must already be defined by the sourcing environment)
stopifnot(
    dir.exists(pdataconfinterim),
    dir.exists(poutputappendix)
)


## Load surname-origin-country crosswalk ----------------------------------------------------
# The .RData file is expected to supply `dist_namelast_mp_max`. Only the
# metaphone surname code and its `cob` column are kept, the latter renamed to
# `coo_mp`.
load(file.path(pdataconfinterim, "geneticDistanceNameLevelMetaphone.RData"))
crosswalk <- dist_namelast_mp_max[, list(namelast_mp, coo_mp = cob)]


## load 1940 census -----------
my_vars <- c(
    "year", "statefip", "countyicp", "region", "relate", "namelast", "age",
    "sex", "race", "nativity", "bpl", "occ1950", "ind1950", "higrade",
    "incwage", "occscore", "mtongue"
)
cen1940 <- fread("/homes/data/census-ipums/current/csv/1940.csv", select = my_vars)

# County identifier: "G" + (state code * 10, left-padded with one "0" when the
# combined code has 6 characters) + county code (2- and 3-character codes are
# left-padded with zeros to 4 characters; other lengths are left untouched).
cen1940[, gisjoin := {
    county <- as.character(countyicp)
    two_chars <- which(str_length(county) == 2)
    county[two_chars] <- str_c("00", county[two_chars])
    three_chars <- which(str_length(county) == 3)
    county[three_chars] <- str_c("0", county[three_chars])
    code <- str_c(as.character(statefip * 10), county)
    six_chars <- which(str_length(code) == 6)
    code[six_chars] <- str_c("0", code[six_chars])
    str_c("G", code)
}]


# Surname cleaning: transliterate to ASCII, upper-case, strip everything that
# is not A-Z, and keep only surnames with at least two letters left.
cen1940[, namelast := gsub(
    "[^A-Z]*", "",
    str_to_upper(stri_trans_general(namelast, "latin-ascii")), # nolint
    perl = TRUE
)]
cen1940 <- cen1940[str_length(namelast) > 1]


# Group phonetically similar names; rows whose metaphone code is missing or
# empty are dropped.
cen1940[, namelast_mp := metaphone(namelast)]
cen1940 <- cen1940[!is.na(namelast_mp) & namelast_mp != ""]

# Store the grouping identifiers as factors.
cen1940[
    ,
    c("namelast_mp", "gisjoin", "statefip") := lapply(.SD, factor),
    .SDcols = c("namelast_mp", "gisjoin", "statefip")
]


## load 1880 census -----------
my_vars <- c(
    "year", "statefip", "countyicp", "namelast", "age", "sex", "race", "bpl",
    "occ1950", "occscore"
)
cen1880 <- fread("/homes/data/census-ipums/current/csv/1880.csv", select = my_vars)

# County identifier: "G" + (state code * 10, left-padded with one "0" when the
# combined code has 6 characters) + county code (2- and 3-character codes are
# left-padded with zeros to 4 characters; other lengths are left untouched).
cen1880[, gisjoin := {
    county <- as.character(countyicp)
    two_chars <- which(str_length(county) == 2)
    county[two_chars] <- str_c("00", county[two_chars])
    three_chars <- which(str_length(county) == 3)
    county[three_chars] <- str_c("0", county[three_chars])
    code <- str_c(as.character(statefip * 10), county)
    six_chars <- which(str_length(code) == 6)
    code[six_chars] <- str_c("0", code[six_chars])
    str_c("G", code)
}]


# Surname cleaning: transliterate to ASCII, upper-case, strip everything that
# is not A-Z, and keep only surnames with at least two letters left.
cen1880[, namelast := gsub(
    "[^A-Z]*", "",
    str_to_upper(stri_trans_general(namelast, "latin-ascii")), # nolint
    perl = TRUE
)]
cen1880 <- cen1880[str_length(namelast) > 1]


# Group phonetically similar names; rows whose metaphone code is missing or
# empty are dropped.
cen1880[, namelast_mp := metaphone(namelast)]
cen1880 <- cen1880[!is.na(namelast_mp) & namelast_mp != ""]

# Store the grouping identifiers as factors.
cen1880[
    ,
    c("namelast_mp", "gisjoin", "statefip") := lapply(.SD, factor),
    .SDcols = c("namelast_mp", "gisjoin", "statefip")
]


## load patents -----------
# Keep patents issued in 1880-1889 or 1940-1949 (bounds inclusive).
f <- file.path(pdataconfinterim, "cusp_patents.csv")
patents <- fread(f)
patents_df <- patents[
    iyear %between% c(1880, 1889) | iyear %between% c(1940, 1949),
    .(patnum, iyear, main_patentclass)
]
setkey(patents_df, patnum)


## add inventor surnames -----------
# Both tables are keyed on patnum, which is what the merge joins on.
f <- file.path(pdataconfinterim, "cusp_patents_inventor_surnames_fips.csv")
inventor_surnames <- fread(f)
setkey(inventor_surnames, patnum)
patents_df <- merge(patents_df, inventor_surnames)


# group phonetically similar names -----------
# Rows whose metaphone code is missing or empty are dropped.
patents_df[, namelast_mp := metaphone(namelast)]
patents_df <- patents_df[!is.na(namelast_mp) & namelast_mp != ""]

# weighted patents if multiple patentees -----------
# Each remaining row of a patent gets an equal share, so weights sum to one
# per patent.
patents_df[, weight := 1 / .N, by = patnum]


# add surname-origin-country crosswalk -----------
patents_df <- merge(patents_df, crosswalk, by = "namelast_mp", all.x = TRUE)
patents_df[is.na(coo_mp), coo_mp := 1] # set unknown origin to USA


## yschool 1940 -----------
# Build the analysis sample from only the columns that are needed, instead of
# duplicating the whole census table first. Derived columns are fresh vectors,
# so `cen1940` itself is never modified.
#   cob     : floor(bpl / 100), with every value below 100 collapsed to 1
#             (presumably U.S.-born -- confirm against the bpl codebook)
#   yschool : floor(higrade / 10) - 3, floored at 0; higrade 999 is missing
temp <- cen1940[, {
    country <- floor(bpl / 100)
    country[which(country < 100)] <- 1
    grade <- floor(replace(higrade, which(higrade == 999), NA) / 10)
    list(
        namelast_mp = namelast_mp,
        yschool = pmax(grade - 3, 0),
        cob = country,
        bpl = bpl,
        gisjoin = gisjoin,
        age = age
    )
}]
temp <- na.omit(temp, cols = c("cob", "bpl", "gisjoin", "age", "yschool"))


# link crosswalk
temp <- merge(temp, crosswalk, by = "namelast_mp", all.x = TRUE)
temp[is.na(coo_mp), coo_mp := 1] # set unknown origin to USA


# For each grouping (surname, country of birth, country of ancestry, county):
#   1. count observations per (group, yschool) cell,
#   2. compute the Herfindahl index of the within-group shares,
#   3. normalise it to (hhi - 1/n) / (1 - 1/n) when the group spans n > 1
#      categories (left as is when n == 1),
#   4. average across groups, weighting each group by its size.
col0n <- vapply(
    c("namelast_mp", "cob", "coo_mp", "gisjoin"),
    function(group_var) {
        cells <- temp[, .N, keyby = c(group_var, "yschool")]
        cells[, p := N / sum(N), by = c(group_var)]
        groups <- cells[
            ,
            .(n_cat = .N, N = sum(N), hhi = sum(p^2)),
            keyby = c(group_var)
        ]
        groups[, hhi_norm := hhi]
        groups[n_cat > 1, hhi_norm := (hhi - 1 / n_cat) / (1 - 1 / n_cat)]
        groups[, weighted.mean(hhi_norm, N)]
    },
    numeric(1),
    USE.NAMES = FALSE
)


## occ1950 1880 -----------
# Build the analysis sample from only the columns that are needed, instead of
# duplicating the whole census table first. Derived columns are fresh vectors,
# so `cen1880` itself is never modified.
#   cob     : floor(bpl / 100), with every value below 100 collapsed to 1
#             (presumably U.S.-born -- confirm against the bpl codebook)
#   occ1950 : codes >= 979 are treated as missing
temp <- cen1880[, {
    country <- floor(bpl / 100)
    country[which(country < 100)] <- 1
    list(
        namelast_mp = namelast_mp,
        occ1950 = replace(occ1950, which(occ1950 >= 979), NA),
        cob = country,
        bpl = bpl,
        gisjoin = gisjoin,
        age = age
    )
}]
temp <- na.omit(temp, cols = c("cob", "bpl", "gisjoin", "age", "occ1950"))


# link crosswalk
temp <- merge(temp, crosswalk, by = "namelast_mp", all.x = TRUE)
temp[is.na(coo_mp), coo_mp := 1] # set unknown origin to USA


# For each grouping (surname, country of birth, country of ancestry, county):
#   1. count observations per (group, occ1950) cell,
#   2. compute the Herfindahl index of the within-group shares,
#   3. normalise it to (hhi - 1/n) / (1 - 1/n) when the group spans n > 1
#      categories (left as is when n == 1),
#   4. average across groups, weighting each group by its size.
col1n <- vapply(
    c("namelast_mp", "cob", "coo_mp", "gisjoin"),
    function(group_var) {
        cells <- temp[, .N, keyby = c(group_var, "occ1950")]
        cells[, p := N / sum(N), by = c(group_var)]
        groups <- cells[
            ,
            .(n_cat = .N, N = sum(N), hhi = sum(p^2)),
            keyby = c(group_var)
        ]
        groups[, hhi_norm := hhi]
        groups[n_cat > 1, hhi_norm := (hhi - 1 / n_cat) / (1 - 1 / n_cat)]
        groups[, weighted.mean(hhi_norm, N)]
    },
    numeric(1),
    USE.NAMES = FALSE
)


## occ1950 1940 -----------
# Build the analysis sample from only the columns that are needed, instead of
# duplicating the whole census table first. Derived columns are fresh vectors,
# so `cen1940` itself is never modified.
#   cob     : floor(bpl / 100), with every value below 100 collapsed to 1
#             (presumably U.S.-born -- confirm against the bpl codebook)
#   occ1950 : codes >= 979 are treated as missing
temp <- cen1940[, {
    country <- floor(bpl / 100)
    country[which(country < 100)] <- 1
    list(
        namelast_mp = namelast_mp,
        occ1950 = replace(occ1950, which(occ1950 >= 979), NA),
        cob = country,
        bpl = bpl,
        gisjoin = gisjoin,
        age = age
    )
}]
temp <- na.omit(temp, cols = c("cob", "bpl", "gisjoin", "age", "occ1950"))


# link crosswalk
temp <- merge(temp, crosswalk, by = "namelast_mp", all.x = TRUE)
temp[is.na(coo_mp), coo_mp := 1] # set unknown origin to USA


# For each grouping (surname, country of birth, country of ancestry, county):
#   1. count observations per (group, occ1950) cell,
#   2. compute the Herfindahl index of the within-group shares,
#   3. normalise it to (hhi - 1/n) / (1 - 1/n) when the group spans n > 1
#      categories (left as is when n == 1),
#   4. average across groups, weighting each group by its size.
col2n <- vapply(
    c("namelast_mp", "cob", "coo_mp", "gisjoin"),
    function(group_var) {
        cells <- temp[, .N, keyby = c(group_var, "occ1950")]
        cells[, p := N / sum(N), by = c(group_var)]
        groups <- cells[
            ,
            .(n_cat = .N, N = sum(N), hhi = sum(p^2)),
            keyby = c(group_var)
        ]
        groups[, hhi_norm := hhi]
        groups[n_cat > 1, hhi_norm := (hhi - 1 / n_cat) / (1 - 1 / n_cat)]
        groups[, weighted.mean(hhi_norm, N)]
    },
    numeric(1),
    USE.NAMES = FALSE
)


## occ1950 immigrants 1880 -----------
# Build the analysis sample from only the columns that are needed, instead of
# duplicating the whole census table first. Derived columns are fresh vectors,
# so `cen1880` itself is never modified.
#   cob     : floor(bpl / 100), with every value below 100 collapsed to 1
#             (presumably U.S.-born -- confirm against the bpl codebook)
#   occ1950 : codes >= 979 are treated as missing
temp <- cen1880[, {
    country <- floor(bpl / 100)
    country[which(country < 100)] <- 1
    list(
        namelast_mp = namelast_mp,
        occ1950 = replace(occ1950, which(occ1950 >= 979), NA),
        cob = country,
        bpl = bpl,
        gisjoin = gisjoin,
        age = age
    )
}]
temp <- na.omit(temp, cols = c("cob", "bpl", "gisjoin", "age", "occ1950"))
# Immigrant sample: keep only rows whose country-of-birth code is >= 100.
temp <- temp[cob >= 100]


# link crosswalk
temp <- merge(temp, crosswalk, by = "namelast_mp", all.x = TRUE)
temp[is.na(coo_mp), coo_mp := 1] # set unknown origin to USA


# For each grouping (surname, country of birth, country of ancestry, county):
#   1. count observations per (group, occ1950) cell,
#   2. compute the Herfindahl index of the within-group shares,
#   3. normalise it to (hhi - 1/n) / (1 - 1/n) when the group spans n > 1
#      categories (left as is when n == 1),
#   4. average across groups, weighting each group by its size.
col3n <- vapply(
    c("namelast_mp", "cob", "coo_mp", "gisjoin"),
    function(group_var) {
        cells <- temp[, .N, keyby = c(group_var, "occ1950")]
        cells[, p := N / sum(N), by = c(group_var)]
        groups <- cells[
            ,
            .(n_cat = .N, N = sum(N), hhi = sum(p^2)),
            keyby = c(group_var)
        ]
        groups[, hhi_norm := hhi]
        groups[n_cat > 1, hhi_norm := (hhi - 1 / n_cat) / (1 - 1 / n_cat)]
        groups[, weighted.mean(hhi_norm, N)]
    },
    numeric(1),
    USE.NAMES = FALSE
)



## occ1950 immigrants 1940 -----------
# Build the analysis sample from only the columns that are needed, instead of
# duplicating the whole census table first. Derived columns are fresh vectors,
# so `cen1940` itself is never modified.
#   cob     : floor(bpl / 100), with every value below 100 collapsed to 1
#             (presumably U.S.-born -- confirm against the bpl codebook)
#   occ1950 : codes >= 979 are treated as missing
temp <- cen1940[, {
    country <- floor(bpl / 100)
    country[which(country < 100)] <- 1
    list(
        namelast_mp = namelast_mp,
        occ1950 = replace(occ1950, which(occ1950 >= 979), NA),
        cob = country,
        bpl = bpl,
        gisjoin = gisjoin,
        age = age
    )
}]
temp <- na.omit(temp, cols = c("cob", "bpl", "gisjoin", "age", "occ1950"))
# Immigrant sample: keep only rows whose country-of-birth code is >= 100.
temp <- temp[cob >= 100]


# link crosswalk
temp <- merge(temp, crosswalk, by = "namelast_mp", all.x = TRUE)
temp[is.na(coo_mp), coo_mp := 1] # set unknown origin to USA


# For each grouping (surname, country of birth, country of ancestry, county):
#   1. count observations per (group, occ1950) cell,
#   2. compute the Herfindahl index of the within-group shares,
#   3. normalise it to (hhi - 1/n) / (1 - 1/n) when the group spans n > 1
#      categories (left as is when n == 1),
#   4. average across groups, weighting each group by its size.
col4n <- vapply(
    c("namelast_mp", "cob", "coo_mp", "gisjoin"),
    function(group_var) {
        cells <- temp[, .N, keyby = c(group_var, "occ1950")]
        cells[, p := N / sum(N), by = c(group_var)]
        groups <- cells[
            ,
            .(n_cat = .N, N = sum(N), hhi = sum(p^2)),
            keyby = c(group_var)
        ]
        groups[, hhi_norm := hhi]
        groups[n_cat > 1, hhi_norm := (hhi - 1 / n_cat) / (1 - 1 / n_cat)]
        groups[, weighted.mean(hhi_norm, N)]
    },
    numeric(1),
    USE.NAMES = FALSE
)


## cob 1880 -----------
# Country of birth is floor(bpl / 100). Only rows with a code >= 100 and a
# non-missing county are kept, so recoding the smaller codes first (as the
# earlier version did) has no effect on the sample and is omitted. The sample
# is built from the needed columns only; `cen1880` is not copied or modified.
temp <- cen1880[, .(namelast_mp, cob = floor(bpl / 100), gisjoin)]
temp <- temp[cob >= 100 & !is.na(gisjoin)]


# For each grouping (surname, county):
#   1. count observations per (group, cob) cell,
#   2. compute the Herfindahl index of the within-group shares,
#   3. normalise it to (hhi - 1/n) / (1 - 1/n) when the group spans n > 1
#      countries (left as is when n == 1),
#   4. average across groups, weighting each group by its size.
cob_hhi_1880 <- vapply(
    c("namelast_mp", "gisjoin"),
    function(group_var) {
        cells <- temp[, .N, keyby = c(group_var, "cob")]
        cells[, p := N / sum(N), by = c(group_var)]
        groups <- cells[
            ,
            .(n_cob = .N, N = sum(N), hhi = sum(p^2)),
            keyby = c(group_var)
        ]
        groups[, hhi_norm := hhi]
        groups[n_cob > 1, hhi_norm := (hhi - 1 / n_cob) / (1 - 1 / n_cob)]
        groups[, weighted.mean(hhi_norm, N)]
    },
    numeric(1),
    USE.NAMES = FALSE
)

# Table rows are surname, country of birth, country of ancestry, county; the
# two middle rows are left empty for this column.
col5n <- c(cob_hhi_1880[1], NA, NA, cob_hhi_1880[2])


## cob 1940 -----------
# Country of birth is floor(bpl / 100). Only rows with a code >= 100 and a
# non-missing county are kept, so recoding the smaller codes first (as the
# earlier version did) has no effect on the sample and is omitted. The sample
# is built from the needed columns only; `cen1940` is not copied or modified.
temp <- cen1940[, .(namelast_mp, cob = floor(bpl / 100), gisjoin)]
temp <- temp[cob >= 100 & !is.na(gisjoin)]


# For each grouping (surname, county):
#   1. count observations per (group, cob) cell,
#   2. compute the Herfindahl index of the within-group shares,
#   3. normalise it to (hhi - 1/n) / (1 - 1/n) when the group spans n > 1
#      countries (left as is when n == 1),
#   4. average across groups, weighting each group by its size.
cob_hhi_1940 <- vapply(
    c("namelast_mp", "gisjoin"),
    function(group_var) {
        cells <- temp[, .N, keyby = c(group_var, "cob")]
        cells[, p := N / sum(N), by = c(group_var)]
        groups <- cells[
            ,
            .(n_cob = .N, N = sum(N), hhi = sum(p^2)),
            keyby = c(group_var)
        ]
        groups[, hhi_norm := hhi]
        groups[n_cob > 1, hhi_norm := (hhi - 1 / n_cob) / (1 - 1 / n_cob)]
        groups[, weighted.mean(hhi_norm, N)]
    },
    numeric(1),
    USE.NAMES = FALSE
)

# Table rows are surname, country of birth, country of ancestry, county; the
# two middle rows are left empty for this column.
col6n <- c(cob_hhi_1940[1], NA, NA, cob_hhi_1940[2])


## bpld Germans 1880 nested ----
# Sample: rows whose birthplace code lies in 45301-45362 (bounds inclusive)
# and whose county is not missing. Per the section title these are the
# detailed German birthplace codes -- confirm against the bpl codebook.
# Filtering and column selection happen in one step, so `cen1880` is neither
# copied in full nor modified.
temp <- cen1880[
    data.table::between(bpl, 45301, 45362) & !is.na(gisjoin),
    .(namelast_mp, bpl, gisjoin)
]


# For each grouping (surname, county):
#   1. count observations per (group, bpl) cell,
#   2. compute the Herfindahl index of the within-group shares,
#   3. normalise it to (hhi - 1/n) / (1 - 1/n) when the group spans n > 1
#      birthplaces (left as is when n == 1),
#   4. average across groups, weighting each group by its size.
bpl_hhi_1880 <- vapply(
    c("namelast_mp", "gisjoin"),
    function(group_var) {
        cells <- temp[, .N, keyby = c(group_var, "bpl")]
        cells[, p := N / sum(N), by = c(group_var)]
        groups <- cells[
            ,
            .(n_bpl = .N, N = sum(N), hhi = sum(p^2)),
            keyby = c(group_var)
        ]
        groups[, hhi_norm := hhi]
        groups[n_bpl > 1, hhi_norm := (hhi - 1 / n_bpl) / (1 - 1 / n_bpl)]
        groups[, weighted.mean(hhi_norm, N)]
    },
    numeric(1),
    USE.NAMES = FALSE
)

# Table rows are surname, country of birth, country of ancestry, county; the
# two middle rows are left empty for this column.
col7n <- c(bpl_hhi_1880[1], NA, NA, bpl_hhi_1880[2])



## tech classes 1880-9 -----------
# For each grouping (surname, country of ancestry, inventor county):
#   1. sum patent weights per (group, main_patentclass) cell for patents
#      issued 1880-1889 (bounds inclusive),
#   2. compute the Herfindahl index of the within-group shares,
#   3. normalise it to (hhi - 1/n) / (1 - 1/n) when the group spans n > 1
#      classes (left as is when n == 1),
#   4. average across groups, weighting each group by its total patent weight.
# NOTE(review): rows with a missing grouping value (e.g. `inv_fips`), if any
# exist, form a group of their own here, exactly as before -- confirm that
# this is intended.
tech_hhi_1880s <- vapply(
    c("namelast_mp", "coo_mp", "inv_fips"),
    function(group_var) {
        cells <- patents_df[
            data.table::between(iyear, 1880, 1889),
            .(N = sum(weight)),
            keyby = c(group_var, "main_patentclass")
        ]
        cells[, p := N / sum(N), by = c(group_var)]
        groups <- cells[
            ,
            .(n_class = .N, N = sum(N), hhi = sum(p^2)),
            keyby = c(group_var)
        ]
        groups[, hhi_norm := hhi]
        groups[n_class > 1, hhi_norm := (hhi - 1 / n_class) / (1 - 1 / n_class)]
        groups[, weighted.mean(hhi_norm, N)]
    },
    numeric(1),
    USE.NAMES = FALSE
)

# Table rows are surname, country of birth, country of ancestry, county; the
# country-of-birth row is left empty for this column.
col8n <- c(tech_hhi_1880s[1], NA, tech_hhi_1880s[2], tech_hhi_1880s[3])


## tech classes 1940-9 -----------
# For each grouping (surname, country of ancestry, inventor county):
#   1. sum patent weights per (group, main_patentclass) cell for patents
#      issued 1940-1949 (bounds inclusive),
#   2. compute the Herfindahl index of the within-group shares,
#   3. normalise it to (hhi - 1/n) / (1 - 1/n) when the group spans n > 1
#      classes (left as is when n == 1),
#   4. average across groups, weighting each group by its total patent weight.
# NOTE(review): rows with a missing grouping value (e.g. `inv_fips`), if any
# exist, form a group of their own here, exactly as before -- confirm that
# this is intended.
tech_hhi_1940s <- vapply(
    c("namelast_mp", "coo_mp", "inv_fips"),
    function(group_var) {
        cells <- patents_df[
            data.table::between(iyear, 1940, 1949),
            .(N = sum(weight)),
            keyby = c(group_var, "main_patentclass")
        ]
        cells[, p := N / sum(N), by = c(group_var)]
        groups <- cells[
            ,
            .(n_class = .N, N = sum(N), hhi = sum(p^2)),
            keyby = c(group_var)
        ]
        groups[, hhi_norm := hhi]
        groups[n_class > 1, hhi_norm := (hhi - 1 / n_class) / (1 - 1 / n_class)]
        groups[, weighted.mean(hhi_norm, N)]
    },
    numeric(1),
    USE.NAMES = FALSE
)

# Table rows are surname, country of birth, country of ancestry, county; the
# country-of-birth row is left empty for this column.
col9n <- c(tech_hhi_1940s[1], NA, tech_hhi_1940s[2], tech_hhi_1940s[3])




# Create table
# Collect the ten column vectors and pad any shorter one with trailing NA so
# that they all share the length of the longest.
table_cols <- list(
    col0n = col0n, col1n = col1n, col2n = col2n, col3n = col3n, col4n = col4n,
    col5n = col5n, col6n = col6n, col7n = col7n, col8n = col8n, col9n = col9n
)
n <- max(lengths(table_cols))
table_cols <- lapply(table_cols, function(x) {
    length(x) <- n
    x
})

row_labels <- c("Surname", "Country of birth", "Country of ancestry", "U.S. county of residence")
# Fail early with a clear message if the columns do not line up with the rows.
stopifnot(length(row_labels) == n)

# A plain data.frame is used because the table needs row names; setting row
# names on a tibble is deprecated and triggers a warning.
tab <- as.data.frame(table_cols)
# Three decimals for every value; missing cells are printed as empty strings.
tab[] <- lapply(tab, function(x) ifelse(is.na(x), "", sprintf("%.3f", x)))
rownames(tab) <- row_labels

# NOTE(review): the earlier version also bound the labels to a global variable
# named `rownames`. It is kept in case `get_clustering_coefs_rows()` or a later
# script reads it -- confirm and remove if unused.
rownames <- row_labels

tablename <- file.path(poutputappendix, "tableB51.tex")
print(
    xtable(tab, digits = 3),
    include.rownames = TRUE,
    file = tablename,
    sanitize.text.function = identity
)
get_clustering_coefs_rows(tablename)

cat("Table B51 saved to:", tablename, "\n")